load data
load("../tables/fileS4.RData")
load("../rdas/HCR.protein.TMM.RData")
expressed.gene.names <- as.character(HCR.protein.TMM.norm.ESNGlabeled[rownames(HCR.protein.TMM.norm.ESNGlabeled) %in% rownames(protein.expressed.data),16])
names(expressed.gene.names) <- rownames(protein.expressed.data)
phenotype specific divergence (i.e. buffering at the downstream phenotype) regress out effects from downstream phenotype (pairwise comparison)
library(limma)
## Warning: package 'limma' was built under R version 3.4.2
library(qvalue)
# test for translation buffering
# HvC
RNA.expressed.data.HC<-RNA.expressed.data[,1:10]
ribo.expressed.data.HC<-ribo.expressed.data[,1:10]
RNA.expressed.weights.HC <- RNA.expressed.weights[,1:10]
# for each gene fit RNA with ribo data and take residue
RNA.specific.data <- matrix(nrow = length(RNA.expressed.data.HC[,1]),ncol = 10)
colnames(RNA.specific.data) <- colnames(RNA.expressed.data.HC)
rownames(RNA.specific.data) <- rownames(RNA.expressed.data.HC)
for (i in 1:length(RNA.expressed.data.HC[,1])){
ribo.fit<-lm(unlist(RNA.expressed.data.HC[i,]) ~ unlist(ribo.expressed.data.HC[i,]),weights = RNA.expressed.weights.HC[i,])
RNA.specific.data[i,] <- ribo.fit$residuals
}
species.label <- substring(colnames(RNA.expressed.data.HC),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Human")
# fit residual RNA effect with a species fix effect model and include voom weights calculated from RNA seq data
HvC.RNA.specific.fit <- lmFit(RNA.specific.data ,design = design,weights = RNA.expressed.weights.HC)
# use eBayes to further adjust variance and compute DE test statistics
HvC.RNA.specific.fit <- eBayes(HvC.RNA.specific.fit)
HvC.translation.buffering.pval <- HvC.RNA.specific.fit$p.value[,2]
# test for post-translation (protein) buffering
# HvC
protein.expressed.data.HC<-protein.expressed.data[,1:10]
ribo.expressed.data.HC<-ribo.expressed.data[,1:10]
ribo.expressed.weights.HC <- ribo.expressed.weights[,1:10]
# for each gene fit ribo with protein data and take residue
ribo.specific.data <- matrix(nrow = length(ribo.expressed.data.HC[,1]),ncol = 10)
colnames(ribo.specific.data) <- colnames(ribo.expressed.data.HC)
rownames(ribo.specific.data) <- rownames(ribo.expressed.data.HC)
for (i in 1:length(ribo.expressed.data.HC[,1])){
protein.fit<-lm(unlist(ribo.expressed.data.HC[i,]) ~ unlist(protein.expressed.data.HC[i,]), na.action = "na.exclude",weights = ribo.expressed.weights.HC[i,])
ribo.specific.data[i,] <- protein.fit$residuals[match(colnames(ribo.specific.data), names(protein.fit$residuals))]
}
species.label <- substring(colnames(ribo.expressed.data.HC),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Human")
# fit residual ribo effect with a species fix effect model and include voom weights calculated from ribo seq data
HvC.ribo.specific.fit <- lmFit(ribo.specific.data ,design = design,weights = ribo.expressed.weights.HC)
# use eBayes to further adjust variance and compute DE test statistics
HvC.ribo.specific.fit <- eBayes(HvC.ribo.specific.fit)
HvC.protein.buffering.pval <- HvC.ribo.specific.fit$p.value[,2]
RvC
# test for translation buffering
# RvC
RNA.expressed.data.RC<-RNA.expressed.data[,c(1:5,11:15)]
ribo.expressed.data.RC<-ribo.expressed.data[,c(1:5,11:15)]
RNA.expressed.weights.RC <- RNA.expressed.weights[,c(1:5,11:15)]
# for each gene fit RNA with ribo data and take residue
RNA.specific.data <- matrix(nrow = length(RNA.expressed.data.RC[,1]),ncol = 10)
colnames(RNA.specific.data) <- colnames(RNA.expressed.data.RC)
rownames(RNA.specific.data) <- rownames(RNA.expressed.data.RC)
for (i in 1:length(RNA.expressed.data.RC[,1])){
ribo.fit<-lm(unlist(RNA.expressed.data.RC[i,]) ~ unlist(ribo.expressed.data.RC[i,]),weights = RNA.expressed.weights.RC[i,])
RNA.specific.data[i,] <- ribo.fit$residuals
}
species.label <- substring(colnames(RNA.expressed.data.RC),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Rhesus")
# fit residual RNA effect with a species fix effect model and include voom weights calculated from RNA seq data
RvC.RNA.specific.fit <- lmFit(RNA.specific.data ,design = design,weights = RNA.expressed.weights.RC)
# use eBayes to further adjust variance and compute DE test statistics
RvC.RNA.specific.fit <- eBayes(RvC.RNA.specific.fit)
RvC.translation.buffering.pval <- RvC.RNA.specific.fit$p.value[,2]
# test for post-translation (protein) buffering
# RvC
protein.expressed.data.RC<-protein.expressed.data[,c(1:5,11:15)]
ribo.expressed.data.RC<-ribo.expressed.data[,c(1:5,11:15)]
ribo.expressed.weights.RC <- ribo.expressed.weights[,c(1:5,11:15)]
# for each gene fit ribo with protein data and take residue
ribo.specific.data <- matrix(nrow = length(ribo.expressed.data.RC[,1]),ncol = 10)
colnames(ribo.specific.data) <- colnames(ribo.expressed.data.RC)
rownames(ribo.specific.data) <- rownames(ribo.expressed.data.RC)
for (i in 1:length(ribo.expressed.data.RC[,1])){
protein.fit<-lm(unlist(ribo.expressed.data.RC[i,]) ~ unlist(protein.expressed.data.RC[i,]), na.action = "na.exclude",weights = ribo.expressed.weights.RC[i,])
ribo.specific.data[i,] <- protein.fit$residuals[match(colnames(ribo.specific.data), names(protein.fit$residuals))]
}
species.label <- substring(colnames(ribo.expressed.data.RC),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Rhesus")
# fit residual ribo effect with a species fix effect model and include voom weights calculated from ribo seq data
RvC.ribo.specific.fit <- lmFit(ribo.specific.data ,design = design,weights = ribo.expressed.weights.RC)
# use eBayes to further adjust variance and compute DE test statistics
RvC.ribo.specific.fit <- eBayes(RvC.ribo.specific.fit)
RvC.protein.buffering.pval <- RvC.ribo.specific.fit$p.value[,2]
RvH
# test for translation buffering
# RvH
RNA.expressed.data.RH<-RNA.expressed.data[,6:15]
ribo.expressed.data.RH<-ribo.expressed.data[,6:15]
RNA.expressed.weights.RH <- RNA.expressed.weights[,6:15]
# for each gene fit RNA with ribo data and take residue
RNA.specific.data <- matrix(nrow = length(RNA.expressed.data.RH[,1]),ncol = 10)
colnames(RNA.specific.data) <- colnames(RNA.expressed.data.RH)
rownames(RNA.specific.data) <- rownames(RNA.expressed.data.RH)
for (i in 1:length(RNA.expressed.data.RH[,1])){
ribo.fit<-lm(unlist(RNA.expressed.data.RH[i,]) ~ unlist(ribo.expressed.data.RH[i,]),weights = RNA.expressed.weights.RH[i,])
RNA.specific.data[i,] <- ribo.fit$residuals
}
species.label <- substring(colnames(RNA.expressed.data.RH),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Rhesus")
# fit residual RNA effect with a species fix effect model and include voom weights calculated from RNA seq data
RvH.RNA.specific.fit <- lmFit(RNA.specific.data ,design = design,weights = RNA.expressed.weights.RH)
# use eBayes to further adjust variance and compute DE test statistics
RvH.RNA.specific.fit <- eBayes(RvH.RNA.specific.fit)
RvH.translation.buffering.pval <- RvH.RNA.specific.fit$p.value[,2]
# test for post-translation (protein) buffering
# RvH
protein.expressed.data.RH<-protein.expressed.data[,6:15]
ribo.expressed.data.RH<-ribo.expressed.data[,6:15]
ribo.expressed.weights.RH <- ribo.expressed.weights[,6:15]
# for each gene fit ribo with protein data and take residue
ribo.specific.data <- matrix(nrow = length(ribo.expressed.data.RH[,1]),ncol = 10)
colnames(ribo.specific.data) <- colnames(ribo.expressed.data.RH)
rownames(ribo.specific.data) <- rownames(ribo.expressed.data.RH)
for (i in 1:length(ribo.expressed.data.RH[,1])){
protein.fit<-lm(unlist(ribo.expressed.data.RH[i,]) ~ unlist(protein.expressed.data.RH[i,]), na.action = "na.exclude",weights = ribo.expressed.weights.RH[i,])
ribo.specific.data[i,] <- protein.fit$residuals[match(colnames(ribo.specific.data), names(protein.fit$residuals))]
}
species.label <- substring(colnames(ribo.expressed.data.RH),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Rhesus")
# fit residual ribo effect with a species fix effect model and include voom weights calculated from ribo seq data
RvH.ribo.specific.fit <- lmFit(ribo.specific.data ,design = design,weights = ribo.expressed.weights.RH)
# use eBayes to further adjust variance and compute DE test statistics
RvH.ribo.specific.fit <- eBayes(RvH.ribo.specific.fit)
RvH.protein.buffering.pval <- RvH.ribo.specific.fit$p.value[,2]
######
#save translational buffering results to file S6
buffer.results <- cbind(HvC.RNA.specific.fit$coefficients[,2], RvC.RNA.specific.fit$coefficients[,2], RvH.RNA.specific.fit$coefficients[,2], HvC.translation.buffering.pval, RvC.translation.buffering.pval, RvH.translation.buffering.pval, qvalue(HvC.translation.buffering.pval)$qvalues, qvalue(RvC.translation.buffering.pval)$qvalues, qvalue(RvH.translation.buffering.pval)$qvalues, p.adjust(HvC.translation.buffering.pval,method = "bonferroni"), p.adjust(RvC.translation.buffering.pval,method = "bonferroni"), p.adjust(RvH.translation.buffering.pval,method = "bonferroni"))
colnames(buffer.results) <- c("HvC.beta","RvC.beta","HvR.beta","HvC.pval","RvC.pval","HvR.pval","HvC.FDR","RvC.FDR","HvR.FDR","HvC.FWER","RvC.FWER","HvR.FWER")
#write.csv(buffer.results,"../../tables/FileS6.csv",quote = FALSE,row.names = TRUE)
######
######
#save post-translational buffering results to file S7
buffer.results <- cbind(HvC.ribo.specific.fit$coefficients[,2], RvC.ribo.specific.fit$coefficients[,2], RvH.ribo.specific.fit$coefficients[,2], HvC.protein.buffering.pval, RvC.protein.buffering.pval, RvH.protein.buffering.pval, qvalue(HvC.protein.buffering.pval)$qvalues, qvalue(RvC.protein.buffering.pval)$qvalues, qvalue(RvH.protein.buffering.pval)$qvalues, p.adjust(HvC.protein.buffering.pval,method = "bonferroni"), p.adjust(RvC.protein.buffering.pval,method = "bonferroni"), p.adjust(RvH.protein.buffering.pval,method = "bonferroni"))
colnames(buffer.results) <- c("HvC.beta","RvC.beta","HvR.beta","HvC.pval","RvC.pval","HvR.pval","HvC.FDR","RvC.FDR","HvR.FDR","HvC.FWER","RvC.FWER","HvR.FWER")
#write.csv(buffer.results,"../../tables/FileS7.csv",quote = FALSE,row.names = TRUE)
######
estimate gene expression divergecen between specise for each phenotype (for plots)
# RNA
species.label <- substring(colnames(RNA.expressed.data),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Human","Rhesus")
RNA.voom.fit <- lmFit(RNA.expressed.data,design = design,weights = RNA.expressed.weights)
RH.contrast <- makeContrasts(Rhesus-Human,levels = design)
RNA.voom.RH.fit <- contrasts.fit(RNA.voom.fit,RH.contrast)
RNA.voom.fit <- eBayes(RNA.voom.fit)
RNA.voom.RH.fit <- eBayes(RNA.voom.RH.fit)
HvC.RNA.effect<-RNA.voom.fit$coefficient[,2]
RvC.RNA.effect<-RNA.voom.fit$coefficient[,3]
RvH.RNA.effect <- RNA.voom.RH.fit$coefficient[,1]
# ribo
species.label <- substring(colnames(ribo.expressed.data),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Human","Rhesus")
ribo.voom.fit <- lmFit(ribo.expressed.data,design = design,weights = ribo.expressed.weights)
RH.contrast <- makeContrasts(Rhesus-Human,levels = design)
ribo.voom.RH.fit <- contrasts.fit(ribo.voom.fit,RH.contrast)
ribo.voom.fit <- eBayes(ribo.voom.fit)
ribo.voom.RH.fit <- eBayes(ribo.voom.RH.fit)
HvC.ribo.effect<-ribo.voom.fit$coefficient[,2]
RvC.ribo.effect<-ribo.voom.fit$coefficient[,3]
RvH.ribo.effect <- ribo.voom.RH.fit$coefficient[,1]
# protein
species.label <- substring(colnames(protein.expressed.data),1,1)
design <- model.matrix(~species.label)
colnames(design)<-c("Int","Human","Rhesus")
protein.fit <- lmFit(protein.expressed.data,design = design)
RH.contrast <- makeContrasts(Rhesus-Human,levels = design)
protein.RH.fit <- contrasts.fit(protein.fit,RH.contrast)
protein.fit <- eBayes(protein.fit)
protein.RH.fit <- eBayes(protein.RH.fit)
HvC.protein.effect<-protein.fit$coefficient[,2]
RvC.protein.effect<-protein.fit$coefficient[,3]
RvH.protein.effect <- protein.RH.fit$coefficient[,1]
scatter plots for visualize results
library(ggplot2)
# human vs chimp translation buffering plot
HvC.FWER<-p.adjust(HvC.translation.buffering.pval,method = "bonferroni")
length(which(HvC.FWER < 0.05))
## [1] 1
HvC.qval<- qvalue(HvC.translation.buffering.pval)$qvalues
length(which(HvC.qval < 0.01))
## [1] 1
# color by FWER value < 0.05
point.color <- cut(HvC.FWER,breaks = c (0,0.05,1), labels = c("blue","grey50"))
#point.color <- cut(HvC.qval,breaks = c (0,0.01,1), labels = c("blue","grey50"))
F3a <- ggplot(mapping = aes(x=HvC.RNA.effect,y=HvC.ribo.effect))+geom_point(size=1, alpha=0.5, col=point.color)+ xlab("log2(human/chimpanzee) RNA")+ylab("log2(human/chimpanzee) RPF")+ggtitle("Translational buffering")+scale_x_continuous(limits = c(-10, 10))+scale_y_continuous(limits = c(-10, 10))+theme_bw()+geom_abline(slope = 1, intercept = 0)+geom_point(aes(x= HvC.RNA.effect[HvC.FWER<0.05],y= HvC.ribo.effect[HvC.FWER<0.05]), col="blue", alpha=0.5)
F3a
######
#Fig3a
#pdf("../../figures/Fig3a.pdf", width = 4, height = 4)
#F3a
#dev.off()
######
# human vs chimp protein buffering plot
HvC.FWER<-p.adjust(HvC.protein.buffering.pval,method = "bonferroni")
length(which(HvC.FWER < 0.05))
## [1] 35
HvC.qval<- qvalue(HvC.protein.buffering.pval)$qvalues
length(which(HvC.qval < 0.01))
## [1] 126
# color by FWER value < 0.05
point.color <- cut(HvC.FWER,breaks = c (0,0.05,1), labels = c("blue","grey50"))
#point.color <- cut(HvC.qval,breaks = c (0,0.01,1), labels = c("blue","grey50"))
F3b <- ggplot(mapping = aes(x=HvC.ribo.effect,y=HvC.protein.effect))+geom_point(size=1, alpha=0.5, col=point.color)+ xlab("log2(human/chimpanzee) RPF")+ylab("log2(human/chimpanzee) protein")+ggtitle("Post-translational buffering")+scale_x_continuous(limits = c(-10, 10))+scale_y_continuous(limits = c(-10, 10))+theme_bw()+geom_abline(slope = 1, intercept = 0)
F3b
######
#Fig3b
#pdf("../../figures/Fig3b.pdf", width = 4, height = 4)
#F3b
#dev.off()
######
# p value qqplot
HvC.translation.buffering.sorted.logP<--log10(sort(HvC.translation.buffering.pval))
HvC.protein.buffering.sorted.logP<--log10(sort(HvC.protein.buffering.pval))
expected <- c(1:length(HvC.protein.buffering.sorted.logP))
log.exp <- -(log10(expected / (length(expected)+1)))
plot(c(0,7), c(0,7), col="red", lwd=2, type="l", xlab="Expected (-log10 Pvalue)", ylab="Observed (-log10 Pvalue)", xlim=c(0,4), ylim=c(0,20), las=1, xaxs="i", yaxs="i", bty="l", main="human vs chimpanzee gene expression buffering")
points(log.exp, HvC.translation.buffering.sorted.logP, pch=18, cex=.6, col="orange")
points(log.exp, HvC.protein.buffering.sorted.logP, pch=18, cex=.6, col="blue")
legend("topleft",c("translational","post-translational"),col = c("orange","blue"),pch=18,cex=.6, bty="n", horiz=TRUE)
######
#Fig3c
#pdf("../../figures/Fig3c.pdf", width = 4, height = 4)
#plot(c(0,7), c(0,7), col="red", lwd=2, type="l", xlab="Expected (-log10 p-value)", ylab="Observed (-log10 p-value)", xlim=c(0,4), ylim=c(0,20), las=1, xaxs="i", yaxs="i", bty="l", main="Gene expression buffering")
#points(log.exp, HvC.translation.buffering.sorted.logP, pch=18, cex=.6, col="orange")
#points(log.exp, HvC.protein.buffering.sorted.logP, pch=18, cex=.6, col="blue")
#legend("topleft",c("translational","post-translational"),col = c("orange","blue"),pch=18,cex=.75, bty="n", horiz=TRUE)
#dev.off()
######
scatter plots for visualize results RvC (supplement)
library(ggplot2)
# rhesus vs chimp translation buffering plot
RvC.FWER<-p.adjust(RvC.translation.buffering.pval,method = "bonferroni")
length(which(RvC.FWER < 0.05))
## [1] 9
RvC.qval<- qvalue(RvC.translation.buffering.pval)$qvalues
length(which(RvC.qval < 0.01))
## [1] 26
# color by FWER value < 0.05
point.color <- cut(RvC.FWER,breaks = c (0,0.05,1), labels = c("blue","grey50"))
#point.color <- cut(RvC.qval,breaks = c (0,0.01,1), labels = c("blue","grey50"))
F3S1a <- ggplot(mapping = aes(x=RvC.RNA.effect,y=RvC.ribo.effect))+geom_point(size=1, alpha=0.5, col=point.color)+ xlab("log2(rhesus/chimpanzee) RNA")+ylab("log2(rhesus/chimpanzee) RPF")+ggtitle("RvC: Translational buffering")+scale_x_continuous(limits = c(-10, 10))+scale_y_continuous(limits = c(-10, 10))+theme_bw()+geom_abline(slope = 1, intercept = 0)
F3S1a
## Warning: Removed 1 rows containing missing values (geom_point).
######
#Fig3_S1a
#pdf("../../figures/Fig3S1a.pdf", width = 4, height = 4)
#F3S1a
#dev.off()
######
# rhesus vs chimp protein buffering plot
RvC.FWER<-p.adjust(RvC.protein.buffering.pval,method = "bonferroni")
length(which(RvC.FWER < 0.05))
## [1] 45
RvC.qval<- qvalue(RvC.protein.buffering.pval)$qvalues
length(which(RvC.qval < 0.01))
## [1] 168
# color by FWER value < 0.01
point.color <- cut(RvC.FWER,breaks = c (0,0.05,1), labels = c("blue","grey50"))
#point.color <- cut(RvC.qval,breaks = c (0,0.01,1), labels = c("blue","grey50"))
F3S1b <- ggplot(mapping = aes(x=RvC.ribo.effect,y=RvC.protein.effect))+geom_point(size=1, alpha=0.5, col=point.color)+ xlab("log2(rhesus/chimpanzee) RPF")+ylab("log2(rhesus/chimpanzee) protein")+ggtitle("RvC: post-translational buffering")+scale_x_continuous(limits = c(-10, 10))+scale_y_continuous(limits = c(-10, 10))+theme_bw()+geom_abline(slope = 1, intercept = 0)
F3S1b
## Warning: Removed 1 rows containing missing values (geom_point).
######
#Fig3_S1b
# pdf("../../figures/Fig3S1b.pdf", width = 4, height = 4)
# F3S1b
# dev.off()
######
# p value qqplot
RvC.translation.buffering.sorted.logP<--log10(sort(RvC.translation.buffering.pval))
RvC.protein.buffering.sorted.logP<--log10(sort(RvC.protein.buffering.pval))
expected <- c(1:length(RvC.protein.buffering.sorted.logP))
log.exp <- -(log10(expected / (length(expected)+1)))
plot(c(0,7), c(0,7), col="red", lwd=2, type="l", xlab="Expected (-log10 Pvalue)", ylab="Observed (-log10 Pvalue)", xlim=c(0,4), ylim=c(0,20), las=1, xaxs="i", yaxs="i", bty="l", main="rhesus vs chimpanzee gene expression buffering")
points(log.exp, RvC.translation.buffering.sorted.logP, pch=18, cex=.6, col="orange")
points(log.exp, RvC.protein.buffering.sorted.logP, pch=18, cex=.6, col="blue")
legend("topleft",c("translational","posttranslational"),col = c("orange","blue"),pch=18,cex=.6, bty="n", horiz=TRUE)
######
#Fig3_S2a
# pdf("../../figures/Fig3S2a.pdf", width = 4, height = 4)
#
# plot(c(0,7), c(0,7), col="red", lwd=2, type="l", xlab="Expected (-log10 p-value)", ylab="Observed (-log10 p-value)", xlim=c(0,4), ylim=c(0,20), las=1, xaxs="i", yaxs="i", bty="l", main="RvC: gene expression buffering")
# points(log.exp, RvC.translation.buffering.sorted.logP, pch=18, cex=.6, col="orange")
# points(log.exp, RvC.protein.buffering.sorted.logP, pch=18, cex=.6, col="blue")
# legend("topleft",c("translational","post-translational"),col = c("orange","blue"),pch=18,cex=.75, bty="n", horiz=TRUE)
#
#
# dev.off()
######
scatter plots for visualize results RvH (supplement)
library(ggplot2)
# rhesus vs human translation buffering plot
RvH.FWER<-p.adjust(RvH.translation.buffering.pval,method = "bonferroni")
length(which(RvH.FWER < 0.05))
## [1] 7
RvH.qval<- qvalue(RvH.translation.buffering.pval)$qvalues
length(which(RvH.qval < 0.01))
## [1] 14
# color by FWER value < 0.05
point.color <- cut(RvH.FWER,breaks = c (0,0.05,1), labels = c("blue","grey50"))
#point.color <- cut(RvH.qval,breaks = c (0,0.01,1), labels = c("blue","grey50"))
F3S1c <- ggplot(mapping = aes(x=RvH.RNA.effect,y=RvH.ribo.effect))+geom_point(size=1, alpha=0.5, col=point.color)+ xlab("log2(rhesus/human) RNA")+ylab("log2(rhesus/human) RPF")+ggtitle("RvH: translational buffering")+scale_x_continuous(limits = c(-10, 10))+scale_y_continuous(limits = c(-10, 10))+theme_bw()+geom_abline(slope = 1, intercept = 0)
F3S1c
## Warning: Removed 1 rows containing missing values (geom_point).
######
#Fig3_S1c
# pdf("../../figures/Fig3S1c.pdf", width = 4, height = 4)
# F3S1c
# dev.off()
######
# rhesus vs human protein buffering plot
RvH.FWER<-p.adjust(RvH.protein.buffering.pval,method = "bonferroni")
length(which(RvH.FWER < 0.05))
## [1] 57
RvH.qval<- qvalue(RvH.protein.buffering.pval)$qvalues
length(which(RvH.qval < 0.01))
## [1] 234
# color by FWER value < 0.01
point.color <- cut(RvH.FWER,breaks = c (0,0.05,1), labels = c("blue","grey50"))
#point.color <- cut(RvH.qval,breaks = c (0,0.01,1), labels = c("blue","grey50"))
F3S1d <- ggplot(mapping = aes(x=RvH.ribo.effect,y=RvH.protein.effect))+geom_point(size=1, alpha=0.5, col=point.color)+ xlab("log2(rhesus/human) RPF")+ylab("log2(rhesus/human) protein")+ggtitle("RvH: post-translational buffering")+scale_x_continuous(limits = c(-10, 10))+scale_y_continuous(limits = c(-10, 10))+theme_bw()+geom_abline(slope = 1, intercept = 0)
F3S1d
## Warning: Removed 1 rows containing missing values (geom_point).
######
#Fig3_S1d
# pdf("../../figures/Fig3S1d.pdf", width = 4, height = 4)
# F3S1d
# dev.off()
######
# p value qqplot
RvH.translation.buffering.sorted.logP<--log10(sort(RvH.translation.buffering.pval))
RvH.protein.buffering.sorted.logP<--log10(sort(RvH.protein.buffering.pval))
expected <- c(1:length(RvH.protein.buffering.sorted.logP))
log.exp <- -(log10(expected / (length(expected)+1)))
plot(c(0,7), c(0,7), col="red", lwd=2, type="l", xlab="Expected (-log10 Pvalue)", ylab="Observed (-log10 Pvalue)", xlim=c(0,4), ylim=c(0,20), las=1, xaxs="i", yaxs="i", bty="l", main="rhesus vs human gene expression buffering")
points(log.exp, RvH.translation.buffering.sorted.logP, pch=18, cex=.6, col="orange")
points(log.exp, RvH.protein.buffering.sorted.logP, pch=18, cex=.6, col="blue")
legend("topleft",c("translational","posttranslational"),col = c("orange","blue"),pch=18,cex=.6, bty="n", horiz=TRUE)
######
#Fig3_S2b
# pdf("../../figures/Fig3S2b.pdf", width = 4, height = 4)
# plot(c(0,7), c(0,7), col="red", lwd=2, type="l", xlab="Expected (-log10 p-value)", ylab="Observed (-log10 p-value)", xlim=c(0,4), ylim=c(0,20), las=1, xaxs="i", yaxs="i", bty="l", main="RvH: gene expression buffering")
# points(log.exp, RvH.translation.buffering.sorted.logP, pch=18, cex=.6, col="orange")
# points(log.exp, RvH.protein.buffering.sorted.logP, pch=18, cex=.6, col="blue")
# legend("topleft",c("translational","post-translational"),col = c("orange","blue"),pch=18,cex=.75, bty="n", horiz=TRUE)
# dev.off()
######
make datafrmae for protein buffering p values and effect sizes
library(plyr)
protein.buffering.results <- as.data.frame(expressed.gene.names)
names(protein.buffering.results)[1] <- "gene.symbol"
protein.buffering.results$HvC.pval <- HvC.protein.buffering.pval
protein.buffering.results$RvC.pval <- RvC.protein.buffering.pval
protein.buffering.results$RvH.pval <- RvH.protein.buffering.pval
# label genes with ENSGID (rownames)
protein.buffering.results$ENSG <- rownames(protein.buffering.results)
#read in protein length
read.table("../rdas/uniprot.reviewd.genename.proteinlength.030515.table", sep="\t", header=T)->protein.length
names(protein.length)<-c("gene.symbol","protein.length.aa")
# compute median protein length for each gene
int.features <- function(x) {
c(protein.length.aa=median(x$protein.length.aa))
}
int.protein.length <- ddply(protein.length, c("gene.symbol"), int.features)
protein.buffering.results.length <- merge(protein.buffering.results,int.protein.length)
Check gene length bias and GC bias for buffered genes (supplement Figure 3 S3)
# load data for UTR length, GC content and such (supplement)
gene.stats <- read.delim(gzfile("../rdas/gene_stats.txt.gz"), stringsAsFactors=F)
int.features <- function(x) {
c(
coding.len=median(x$coding.len),
gc.content=median(x$gc.content),
total.len=median(x$coding.len + x$utr5.len + x$utr3.len)
)
}
int.stats <- ddply(gene.stats, c("ENSG"), int.features)
gene.stats <- merge(protein.buffering.results, int.stats)
# check length bias
r2 <- paste("r^2 == ", round(cor(-log10(gene.stats$HvC.pval), log10(gene.stats$total.len))^2, 4))
F3S3a <- ggplot(mapping = aes(x=-log10(gene.stats$HvC.pval),y=log10(gene.stats$total.len)))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("-log10(p-value)")+ylab("log10[gene length (bp)]")+theme_bw()+stat_smooth(method = "lm")+ggtitle("HvC: length distribution for \n post-translationally buffered genes")+theme(plot.title = element_text(size = 12)) + annotate("text",label=r2, x=9, y=4,parse=TRUE)
F3S3a
######
#fig3_S3a
# pdf("../../figures/Fig3S3a.pdf", width = 4, height = 4)
# F3S3a
# dev.off()
######
r2 <- paste("r^2 == ", round(cor(-log10(gene.stats$RvC.pval), log10(gene.stats$total.len))^2, 4))
F3S3b <- ggplot(mapping = aes(x=-log10(gene.stats$RvC.pval),y=log10(gene.stats$total.len)))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("-log10(p-value)")+ylab("log10[gene length (bp)]")+theme_bw()+stat_smooth(method = "lm")+ggtitle("RvC: length distribution for \n post-translationally buffered genes")+theme(plot.title = element_text(size = 12)) + annotate("text",label=r2, x=9, y=4,parse=TRUE)
F3S3b
######
#fig3_S3b
# pdf("../../figures/Fig3S3b.pdf", width = 4, height = 4)
# F3S3b
# dev.off()
######
r2 <- paste("r^2 == ", round(cor(-log10(gene.stats$RvH.pval), log10(gene.stats$total.len))^2, 4))
F3S3c <- ggplot(mapping = aes(x=-log10(gene.stats$RvH.pval),y=log10(gene.stats$total.len)))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("-log10(p-value)")+ylab("log10[gene length (bp)]")+theme_bw()+stat_smooth(method = "lm")+ggtitle("RvH: length distribution for \n post-translationally buffered genes")+theme(plot.title = element_text(size = 12)) + annotate("text",label=r2, x=9, y=4,parse=TRUE)
F3S3c
######
#fig3_S3c
# pdf("../../figures/Fig3S3c.pdf", width = 4, height = 4)
# F3S3c
# dev.off()
######
# check GC content bias
r2 <- paste("r^2 == ", round(cor(-log10(gene.stats$HvC.pval), gene.stats$gc.content)^2, 4))
F3S3d <- ggplot(mapping = aes(x=-log10(gene.stats$HvC.pval),y=100*gene.stats$gc.content))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("-log10(p-value)")+ylab("GC content (%)")+theme_bw()+stat_smooth(method = "lm")+ggtitle("HvC: GC content distribution for \n post-translationally buffered genes")+theme(plot.title = element_text(size = 12)) + annotate("text",label=r2, x=9, y=65,parse=TRUE)
F3S3d
######
#fig3_S3d
# pdf("../../figures/Fig3S3d.pdf", width = 4, height = 4)
# F3S3d
# dev.off()
######
r2 <- paste("r^2 == ", round(cor(-log10(gene.stats$RvC.pval), gene.stats$gc.content)^2, 5))
F3S3e <- ggplot(mapping = aes(x=-log10(gene.stats$RvC.pval),y=100*gene.stats$gc.content))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("-log10(p-value)")+ylab("GC content (%)")+theme_bw()+stat_smooth(method = "lm")+ggtitle("RvC: GC content distribution for \n post-translationally buffered genes")+theme(plot.title = element_text(size = 12)) + annotate("text",label=r2, x=9, y=65,parse=TRUE)
F3S3e
######
#fig3_S3e
# pdf("../../figures/Fig3S3e.pdf", width = 4, height = 4)
# F3S3e
# dev.off()
######
r2 <- paste("r^2 == ", round(cor(-log10(gene.stats$RvH.pval), gene.stats$gc.content)^2, 4))
F3S3f <- ggplot(mapping = aes(x=-log10(gene.stats$RvH.pval),y=100*gene.stats$gc.content))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("-log10(p-value)")+ylab("GC content (%)")+theme_bw()+stat_smooth(method = "lm")+ggtitle("RvH: GC content distribution for \n post-translationally buffered genes")+theme(plot.title = element_text(size = 12)) + annotate("text",label=r2, x=9, y=65,parse=TRUE)
F3S3f
######
#fig3_S3f
# pdf("../../figures/Fig3S3f.pdf", width = 4, height = 4)
# F3S3f
# dev.off()
######
Enrichment analysis: dN (nubmer of AA substitutions standardized by possible sites) and dN/dS (purifying selection on coding region)
dN.dS.data <- read.delim(gzfile("../rdas/dNdS_pairwise.txt.gz"))
names(dN.dS.data)[1] <- "ENSG"
dN.dS.data <- dN.dS.data[dN.dS.data$dS.hc != 0,]
dN.dS.data$dN.dS.hc <- dN.dS.data$dN.hc / dN.dS.data$dS.hc
dN.dS.data$dN.dS.hr <- dN.dS.data$dN.hr / dN.dS.data$dS.hr
dN.dS.data$dN.dS.cr <- dN.dS.data$dN.cr / dN.dS.data$dS.cr
protein.buffering.pval.dNdS <- merge(protein.buffering.results.length, dN.dS.data)
# HvC
cor.test(-log10(protein.buffering.pval.dNdS$HvC.pval), protein.buffering.pval.dNdS$dN.dS.hc)
##
## Pearson's product-moment correlation
##
## data: -log10(protein.buffering.pval.dNdS$HvC.pval) and protein.buffering.pval.dNdS$dN.dS.hc
## t = -0.16421, df = 2498, p-value = 0.8696
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04248283 0.03592177
## sample estimates:
## cor
## -0.00328558
cor.test(-log10(protein.buffering.pval.dNdS$HvC.pval), protein.buffering.pval.dNdS$dN.hc)
##
## Pearson's product-moment correlation
##
## data: -log10(protein.buffering.pval.dNdS$HvC.pval) and protein.buffering.pval.dNdS$dN.hc
## t = -0.088045, df = 2498, p-value = 0.9298
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04096150 0.03744369
## sample estimates:
## cor
## -0.00176161
# RvC
cor.test(-log10(protein.buffering.pval.dNdS$RvC.pval), protein.buffering.pval.dNdS$dN.dS.cr)
##
## Pearson's product-moment correlation
##
## data: -log10(protein.buffering.pval.dNdS$RvC.pval) and protein.buffering.pval.dNdS$dN.dS.cr
## t = -1.4876, df = 2498, p-value = 0.137
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.068873343 0.009462803
## sample estimates:
## cor
## -0.02975095
cor.test(-log10(protein.buffering.pval.dNdS$RvC.pval), protein.buffering.pval.dNdS$dN.cr)
##
## Pearson's product-moment correlation
##
## data: -log10(protein.buffering.pval.dNdS$RvC.pval) and protein.buffering.pval.dNdS$dN.cr
## t = -1.3716, df = 2498, p-value = 0.1703
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.06656352 0.01178301
## sample estimates:
## cor
## -0.02743238
# RvH
cor.test(-log10(protein.buffering.pval.dNdS$RvH.pval), protein.buffering.pval.dNdS$dN.dS.hr)
##
## Pearson's product-moment correlation
##
## data: -log10(protein.buffering.pval.dNdS$RvH.pval) and protein.buffering.pval.dNdS$dN.dS.hr
## t = 0.40885, df = 2498, p-value = 0.6827
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03103263 0.04736757
## sample estimates:
## cor
## 0.008180037
cor.test(-log10(protein.buffering.pval.dNdS$RvH.pval), protein.buffering.pval.dNdS$dN.hr)
##
## Pearson's product-moment correlation
##
## data: -log10(protein.buffering.pval.dNdS$RvH.pval) and protein.buffering.pval.dNdS$dN.hr
## t = 0.48753, df = 2498, p-value = 0.6259
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.02945999 0.04893800
## sample estimates:
## cor
## 0.009753998
# pull test results together in a vector
# test for both positive and negative correlations
#dNdS
HvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$HvC.pval), protein.buffering.pval.dNdS$dN.dS.hc)
RvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvC.pval), protein.buffering.pval.dNdS$dN.dS.cr)
RvH.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvH.pval), protein.buffering.pval.dNdS$dN.dS.hr)
dNdS.results<- c("dNdS",HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
dNdS.results
## cor
## "dNdS" "-0.00328557991242245" "0.869575842003152"
## cor cor
## "-0.0297509526851727" "0.136979724874337" "0.0081800372136801"
##
## "0.682683392922031"
dNdS.dir <- substring(dNdS.results,1,1)
dNdS.dir[dNdS.dir == 0] <- "+"
dNdS.results[c(2,4,6)] <- dNdS.dir[c(2,4,6)]
#dN
HvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$HvC.pval), protein.buffering.pval.dNdS$dN.hc)
RvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvC.pval), protein.buffering.pval.dNdS$dN.cr)
RvH.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvH.pval), protein.buffering.pval.dNdS$dN.hr)
dN.results<- c("dN",HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
dN.results
## cor
## "dN" "-0.00176161034280036" "0.929847638090926"
## cor cor
## "-0.0274323812652774" "0.170315382935625" "0.00975399760402635"
##
## "0.625927019816301"
dN.dir <- substring(dN.results,1,1)
dN.dir[dN.dir == 0] <- "+"
dN.results[c(2,4,6)] <- dN.dir[c(2,4,6)]
#
# # test for only positive correlations
# #dNdS
# HvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$HvC.pval), protein.buffering.pval.dNdS$dN.dS.hc, alternative = "greater")
# RvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvC.pval), protein.buffering.pval.dNdS$dN.dS.cr, alternative = "greater")
# RvH.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvH.pval), protein.buffering.pval.dNdS$dN.dS.hr, alternative = "greater")
# dNdS.results<- c("dNdS",HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
#
# dNdS.results
#
# #dN
# HvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$HvC.pval), protein.buffering.pval.dNdS$dN.hc, alternative = "greater")
# RvC.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvC.pval), protein.buffering.pval.dNdS$dN.cr, alternative = "greater")
# RvH.cor.test <- cor.test(-log10(protein.buffering.pval.dNdS$RvH.pval), protein.buffering.pval.dNdS$dN.hr, alternative = "greater")
# dN.results<- c("dN",HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
#
#
# dN.results
#
###### table S2
### write out dN and dN/dS enrichment results as table S2
names(dN.results) <- c("","HvC.cor","HvC.pvalue","RvC.cor","RvC.pvalue","RvH.cor","RvH.pvalue")
substitution.rate <- rbind(dN.results, dNdS.results)
rownames(substitution.rate) <- c("Ka","Ka/Ks")
substitution.rate[,c(3,5,7)] <- format(as.numeric(substitution.rate[,c(3,5,7)]), digits=2)
#write.csv(substitution.rate[,-1],"../../tables/tableS2.csv", quote = F)
######
# plot out dN vs buffering p value for HvC
######### Figure
pvalue.by.dN <- list()
for (i in 1:4){
pvalue.by.dN[[i]] <- protein.buffering.pval.dNdS$dN.hc[which(-log10(protein.buffering.pval.dNdS$HvC.pval) > i-1 & -log10(protein.buffering.pval.dNdS$HvC.pval) <i) ]
}
pvalue.by.dN[[5]] <- protein.buffering.pval.dNdS$dN.hc[which(-log10(protein.buffering.pval.dNdS$HvC.pval) > 4)]
HvC.mean.pvalue.by.dN<- unlist(lapply(pvalue.by.dN,mean))
HvC.sd.pvalue.by.dN<- unlist(lapply(pvalue.by.dN,sd))
HvC.se.pvalue.by.dN <- HvC.sd.pvalue.by.dN/sqrt(lengths(pvalue.by.dN))
plot(c(1:5),HvC.mean.pvalue.by.dN, ylim = c(0,0.04),pch = 19,ylab = "Ka", xlab = ("-log10(p-value)"), main = "buffered genes are not enriched of nonsynonymous substitutions",xaxt="n", cex = 0.5, col = rgb(0,0,0))
axis(1,at = (1:5),labels = c("0~1","1~2","2~3","3~4","4~"))
arrows(c(1:5), HvC.mean.pvalue.by.dN-HvC.se.pvalue.by.dN, c(1:5), HvC.mean.pvalue.by.dN+HvC.se.pvalue.by.dN, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
######
######
#figure 3d
# pdf("../../figures/Fig3d.pdf", width = 4, height = 3)
# plot(c(1:5),HvC.mean.pvalue.by.dN, ylim = c(0,0.04),pch = 19,ylab = "Ka", xlab = ("-log10(p-value)"), main = "Post-translationally buffered genes are \n not enriched for nonsynonymous substitutions",xaxt="n", cex = 0.5, col = rgb(0,0,0), cex.main=0.9)
# axis(1,at = (1:5),labels = c("0~1","1~2","2~3","3~4","4~"))
# arrows(c(1:5), HvC.mean.pvalue.by.dN-HvC.se.pvalue.by.dN, c(1:5), HvC.mean.pvalue.by.dN+HvC.se.pvalue.by.dN, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#
# dev.off()
######
Enrichment PTM
# read in data and save in a list
PTM.label <- c("Ubiquitination","Phosphorylation","Acetylation","Methylation","Sumoylation","O-GalNAc","O-GlcNAc")
number.reported.PTM.sites <- list()
for (i in 1:length(PTM.label)){
number.reported.PTM.sites[[i]] <- read.delim(gzfile(paste0("../rdas/PSP_Download_AUG_5_2016/",PTM.label[i],"_site_dataset.gz")), stringsAsFactors=F,skip = 3)
#number.reported.PTM.sites[[i]] <- read.delim(gzfile(paste0("~/Desktop/PSP_Download_AUG_5_2016/",PTM.label[i],"_site_dataset.gz")), stringsAsFactors=F,skip = 3)
}
# writ a funtion for enrichment test
Mod.site.enrich <- function(modData, DEresults, hitCutoff){
names(modData)[3] <- "gene.symbol"
modData$gene.symbol <- toupper(modData$gene.symbol)
modData$ORGANISM <- tolower(modData$ORGANISM)
modData <- modData[which(modData$ORGANISM == "human"),]
modData <- modData[which(modData$gene.symbol != ""),]
count.sites <- function(x) { c(n.sites=dim(x)[1])}
if(missing(hitCutoff)){
modData.sites <- ddply(modData, c("gene.symbol"), count.sites)
pval.modData.sites <- merge(DEresults, modData.sites)
# pull test results together in a vector
HvC.cor.test <- cor.test(-log10(pval.modData.sites$HvC.pval), pval.modData.sites$n.sites/pval.modData.sites$protein.length.aa)
RvC.cor.test <- cor.test(-log10(pval.modData.sites$RvC.pval), pval.modData.sites$n.sites/pval.modData.sites$protein.length.aa)
RvH.cor.test <- cor.test(-log10(pval.modData.sites$RvH.pval), pval.modData.sites$n.sites/pval.modData.sites$protein.length.aa)
}
else{
modData[is.na(modData)] <- 0
confident.hit.index <- modData$LT_LIT+modData$MS_LIT+modData$MS_CST > hitCutoff
modData.highHit <- modData[confident.hit.index,]
modData.lowHit <- modData[-confident.hit.index,]
modData.condifent.sites <- ddply(modData.highHit, c("gene.symbol"), count.sites)
modData.spurious.sites <- ddply(modData.lowHit, c("gene.symbol"), count.sites)
modData.sites <- merge(modData.condifent.sites,modData.spurious.sites,all.x = T, all.y = T, by = "gene.symbol")
# name the number of confident sites as n.sites for enrichment test
names(modData.sites)[2] <- "n.sites"
modData.sites[is.na(modData.sites)]<-0
pval.modData.sites <- merge(DEresults, modData.sites)
# pull test results together in a vector
HvC.cor.test <- cor.test(-log10(pval.modData.sites$HvC.pval), pval.modData.sites$n.sites/pval.modData.sites$protein.length.aa)
RvC.cor.test <- cor.test(-log10(pval.modData.sites$RvC.pval), pval.modData.sites$n.sites/pval.modData.sites$protein.length.aa)
RvH.cor.test <- cor.test(-log10(pval.modData.sites$RvH.pval), pval.modData.sites$n.sites/pval.modData.sites$protein.length.aa)
}
modData.results<- c(HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
return(modData.results)
}
# run the funtion to count
#Mod.site.enrich.bufferedGenes.results <- lapply(number.reported.PTM.sites, Mod.site.enrich, protein.buffering.results.length, 1)
Mod.site.enrich.bufferedGenes.results <- lapply(number.reported.PTM.sites, Mod.site.enrich, protein.buffering.results.length)
Mod.site.enrich.bufferedGenes.results <- as.data.frame(Mod.site.enrich.bufferedGenes.results, stringsAsFactors = FALSE)
names(Mod.site.enrich.bufferedGenes.results) <- PTM.label
rownames(Mod.site.enrich.bufferedGenes.results) <- c("HvC.cor","HvC.pvalue","RvC.cor","RvC.pvalue","RvH.cor","RvH.pvalue")
Mod.site.enrich.bufferedGenes.results[Mod.site.enrich.bufferedGenes.results == 0] <- "2.2e-16"
Mod.site.enrich.bufferedGenes.results<- t(Mod.site.enrich.bufferedGenes.results)
#Mod.site.enrich.bufferedGenes.results <- trimws(Mod.site.enrich.bufferedGenes.results,which = "both")
#Mod.site.enrich.bufferedGenes.results[c(2,4,6),]
#cor.dir <- substring(Mod.site.enrich.bufferedGenes.results,1,1)
#cor.dir[cor.dir != "-"] <- "+"
#Mod.site.enrich.bufferedGenes.results[,c(1,3,5)] <- cor.dir[,c(1,3,5)]
# write PTM enrichment results out as tableS3
######
tableS3 <- Mod.site.enrich.bufferedGenes.results
tableS3
## HvC.cor HvC.pvalue RvC.cor RvC.pvalue
## Ubiquitination 0.185587452 1.310943e-22 0.15375539 6.270373e-16
## Phosphorylation 0.100459144 2.509814e-08 0.04492751 1.287777e-02
## Acetylation 0.184705545 8.888862e-18 0.11617729 7.740240e-08
## Methylation 0.156457146 4.756005e-05 0.09550128 1.339736e-02
## Sumoylation 0.209472931 3.741665e-07 0.13502216 1.137768e-03
## O-GalNAc -0.114295236 3.763994e-01 -0.13392649 2.993812e-01
## O-GlcNAc -0.002262742 9.859577e-01 0.06410437 6.176825e-01
## RvH.cor RvH.pvalue
## Ubiquitination 0.17662206 1.350461e-20
## Phosphorylation 0.08283907 4.401834e-06
## Acetylation 0.13677337 2.389075e-10
## Methylation 0.14399371 1.842171e-04
## Sumoylation 0.15374536 2.069567e-04
## O-GalNAc 0.03314200 7.981672e-01
## O-GlcNAc 0.24875477 4.930859e-02
#rownames(tableS3) <- c("HvC", "RvC", "RvH")
#tableS3[,c(2,4,6)]<- format(as.numeric(tableS3[,c(2,4,6)]),digits = 3)
#write.csv(tableS3,"../../tables/tableS3.csv", quote = F)
######
# make bar plots to show correlation between buffering p value and nubmer of ubq sites
# HvC
ubiq <- read.delim(gzfile("../rdas/Ubiquitination_site_dataset.gz"), stringsAsFactors=F)
names(ubiq)[3] <- "gene.symbol"
ubiq$gene.symbol <- toupper(ubiq$gene.symbol)
# will have to change this line to fit the updated dataset from cell singnaling
ubiq$ORG <- tolower(ubiq$ORG)
ubiq <- ubiq[which(ubiq$ORG == "human"),]
ubiq <- ubiq[which(ubiq$gene.symbol != ""),]
count.sites <- function(x) { c(n.sites=dim(x)[1])}
ubiq.sites <- ddply(ubiq, c("gene.symbol"), count.sites)
protein.buffering.pval.ubiq.sites <- merge(protein.buffering.results.length, ubiq.sites)
pvalue.by.ubq <- list()
protein.buffering.pval.ubiq.sites$sites.per.aa <- protein.buffering.pval.ubiq.sites$n.sites/protein.buffering.pval.ubiq.sites$protein.length.aa
for (i in 1:4){
pvalue.by.ubq[[i]] <- protein.buffering.pval.ubiq.sites$sites.per.aa[which(-log10(protein.buffering.pval.ubiq.sites$HvC.pval) > i-1 & -log10(protein.buffering.pval.ubiq.sites$HvC.pval) <i) ]
}
pvalue.by.ubq[[5]] <- protein.buffering.pval.ubiq.sites$sites.per.aa[which(-log10(protein.buffering.pval.ubiq.sites$HvC.pval) > 4)]
HvC.mean.pvalue.by.ubq<- unlist(lapply(pvalue.by.ubq,mean))
HvC.sd.pvalue.by.ubq<- unlist(lapply(pvalue.by.ubq,sd))
HvC.se.pvalue.by.ubq <- HvC.sd.pvalue.by.ubq/sqrt(lengths(pvalue.by.ubq))
plot(c(1:5),HvC.mean.pvalue.by.ubq, ylim = c(0,0.04),pch = 19,ylab = "NO. of ubq site per aa", xlab = ("-log10(p-value)"), main = "buffered genes have more ubiquitination sites",xaxt="n", cex = 0.5, col = rgb(0,0,0))
axis(1,at = (1:5),labels = c("0~1","1~2","2~3","3~4","4~"))
arrows(c(1:5), HvC.mean.pvalue.by.ubq-HvC.se.pvalue.by.ubq, c(1:5), HvC.mean.pvalue.by.ubq+HvC.se.pvalue.by.ubq, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
######
#figure 3e
# pdf("../../figures/Fig3e.pdf", width = 4, height = 3)
# plot(c(1:5),HvC.mean.pvalue.by.ubq, ylim = c(0,0.04),pch = 19,ylab = "NO. of ubq site per aa", xlab = ("-log10(p-value)"), main = "Post-translationally buffered genes have \n more ubiquitination sites",xaxt="n", cex = 0.5, col = rgb(0,0,0), cex.main=0.9)
# axis(1,at = (1:5),labels = c("0~1","1~2","2~3","3~4","4~"))
# arrows(c(1:5), HvC.mean.pvalue.by.ubq-HvC.se.pvalue.by.ubq, c(1:5), HvC.mean.pvalue.by.ubq+HvC.se.pvalue.by.ubq, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
# dev.off()
######
Enrichment analysis: subcellular localization and GO
# uniprot
subcell<- read.delim(gzfile("../rdas/uniprot_reviewed_human_SubCell_GO.tab.gz"), stringsAsFactors=F)
names(subcell)[2] <- "gene.symbol"
protein.buffering.results.subcell<-merge(protein.buffering.results,subcell)
# create a list of subcellular locations of interests
SubCellLoc.list<- c("Cytoplasm","Endoplasmic", "Golgi", "cytosol", "Nucleus", "Lysosome", "Mitochondrion", "Peroxisome", "Membrane", "endosome", "Secreted", "P-body")
Enrichment.results <- as.data.frame(matrix(nrow = length(SubCellLoc.list),ncol = 8))
names(Enrichment.results) <- c("SubCellLoc","NO. of genes associated","HvC.cor","HvC.p.value","RvC.cor","RvC.p.value","HvR.cor","HvR.p.value")
# write a for loop to grep the subcellular location terms and test for association with significance of protein buffering
for (i in 1:length(SubCellLoc.list)){
E.index<-grep(SubCellLoc.list[i],protein.buffering.results.subcell$Subcellular.location..CC.,fixed = TRUE)
E.boolean<-rep(FALSE,dim(protein.buffering.results.subcell)[1])
E.boolean[E.index] <- TRUE
HvC.cor.test <- cor.test(-log10(protein.buffering.results.subcell$HvC.pval), as.numeric(E.boolean))
RvC.cor.test <- cor.test(-log10(protein.buffering.results.subcell$RvC.pval), as.numeric(E.boolean))
RvH.cor.test <- cor.test(-log10(protein.buffering.results.subcell$RvH.pval), as.numeric(E.boolean))
Enrichment.results[i,] <- c(SubCellLoc.list[i],paste0(table(E.boolean)[2],"|",table(E.boolean)[1]),HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
}
Enrichment.results$HvC.fdr <- p.adjust(as.numeric(Enrichment.results$HvC.p.value), method = "fdr")
Enrichment.results$RvC.fdr <- p.adjust(as.numeric(Enrichment.results$RvC.p.value), method = "fdr")
Enrichment.results$HvR.fdr <- p.adjust(as.numeric(Enrichment.results$HvR.p.value), method = "fdr")
results.table <- Enrichment.results[,c(1:3,5,7,4,6,8:11)]
results.table$HvC.cor[results.table$HvC.cor < 0] <- "-"
results.table$HvC.cor[results.table$HvC.cor > 0] <- "+"
results.table$RvC.cor[results.table$RvC.cor < 0] <- "-"
results.table$RvC.cor[results.table$RvC.cor > 0] <- "+"
results.table$HvR.cor[results.table$HvR.cor < 0] <- "-"
results.table$HvR.cor[results.table$HvR.cor > 0] <- "+"
results.table
## SubCellLoc NO. of genes associated HvC.cor RvC.cor HvR.cor
## 1 Cytoplasm 1393|1773 - + +
## 2 Endoplasmic 263|2903 - + +
## 3 Golgi 222|2944 - - -
## 4 cytosol 136|3030 - - -
## 5 Nucleus 1239|1927 + + +
## 6 Lysosome 64|3102 - - -
## 7 Mitochondrion 426|2740 - - -
## 8 Peroxisome 26|3140 - - -
## 9 Membrane 232|2934 - - +
## 10 endosome 112|3054 - - -
## 11 Secreted 55|3111 - - +
## 12 P-body 24|3142 - + +
## HvC.p.value RvC.p.value HvR.p.value
## 1 0.353170168551029 0.61238554009631 0.961129387661925
## 2 0.155259717132281 0.203645006040008 0.705632597351873
## 3 0.00215707163815855 0.228269832143733 0.155093645582068
## 4 0.195795490678322 0.0149695773129384 0.187326273709998
## 5 1.31793487614123e-05 0.330386821450183 0.53394532250764
## 6 0.25881406371456 0.31864410406083 0.764731116587812
## 7 0.00703194169114029 0.00022996898751702 0.0719722889664554
## 8 0.591660427616302 0.441228300379599 0.996177151428022
## 9 0.0424232017444528 0.736804480757727 0.287547951746081
## 10 0.0200610458718292 0.072581827610134 0.0105605410030724
## 11 0.666536531115389 0.581018818038218 0.16075586624712
## 12 0.972544938298368 0.245507484791751 0.484143058928144
## HvC.fdr RvC.fdr HvR.fdr
## 1 0.4708935581 0.668056953 0.9961772
## 2 0.3105194343 0.491014970 0.9176773
## 3 0.0129424298 0.491014970 0.4495831
## 4 0.3356494126 0.089817464 0.4495831
## 5 0.0001581522 0.495580232 0.8009180
## 6 0.3882210956 0.495580232 0.9176773
## 7 0.0281277668 0.002759628 0.4318337
## 8 0.7099925131 0.588304401 0.9961772
## 9 0.1018156842 0.736804481 0.5750959
## 10 0.0601831376 0.290327310 0.1267265
## 11 0.7271307612 0.668056953 0.4495831
## 12 0.9725449383 0.491014970 0.8009180
######
#write sub cellular localization results to tableS4
tableS4 <- results.table[,1:8]
tableS4[,6:8] <- format(as.numeric(as.matrix(tableS4[,6:8])),digit=3)
#write.csv(tableS4,"../../tables/tableS4.csv", quote = F,row.names = F)
######
## Gene Ontology terms
# create a list of GO terms associated with the portein "universe"
GO.ID.list<-unique(unlist(strsplit(protein.buffering.results.subcell$Gene.ontology.IDs,split = "; ")))
GO.results <- as.data.frame(matrix(nrow = length(GO.ID.list),ncol = 8))
names(GO.results) <- c("GO.ID","NO. of genes associated","HvC.cor","HvC.p.value","RvC.cor","RvC.p.value","HvR.cor","HvR.p.value")
# write a for loop to grep the GO term and test for association with protein buffering
for (i in 1:length(GO.ID.list)){
GO.index<-grep(GO.ID.list[i],protein.buffering.results.subcell$Gene.ontology.IDs)
GO.boolean<-rep(FALSE,dim(protein.buffering.results.subcell)[1])
GO.boolean[GO.index] <- TRUE
HvC.cor.test <- cor.test(-log10(protein.buffering.results.subcell$HvC.pval), as.numeric(GO.boolean))
RvC.cor.test <- cor.test(-log10(protein.buffering.results.subcell$RvC.pval), as.numeric(GO.boolean))
RvH.cor.test <- cor.test(-log10(protein.buffering.results.subcell$RvH.pval), as.numeric(GO.boolean))
GO.results[i,] <- c(GO.ID.list[i],paste0(table(GO.boolean)[2],"|",table(GO.boolean)[1]),HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
}
# asign a "minimum" p value of "2.220446e-16" for the enries that got rounded down to zero
GO.results$HvC.p.value[which(GO.results$HvC.p.value == 0)] <- "2.220446e-16"
GO.results$RvC.p.value[which(GO.results$RvC.p.value == 0)] <- "2.220446e-16"
GO.results$HvR.p.value[which(GO.results$HvR.p.value == 0)] <- "2.220446e-16"
GO.results$HvC.fdr <- p.adjust(as.numeric(GO.results$HvC.p.value), method = "fdr")
GO.results$RvC.fdr <- p.adjust(as.numeric(GO.results$RvC.p.value), method = "fdr")
GO.results$HvR.fdr <- p.adjust(as.numeric(GO.results$HvR.p.value), method = "fdr")
# load Gene Ontology ID and description table to print out a summary
IDtoTerm<- read.delim("../rdas/GO_IDtoTerm.table",header = F, stringsAsFactors = F,sep = "\t",skip = 11)
names(IDtoTerm) <- c("GO.ID", "Secondary.ID", "Term", "F|P|C","status")
GO.results <- merge(GO.results,IDtoTerm)
GO.top.table<- GO.results[GO.results$HvC.fdr < 0.01 & GO.results$HvR.fdr < 0.01 & GO.results$RvC.fdr < 0.01,]
GO.top.table[,c(1:3,5,7,9:11,13)]
## GO.ID NO. of genes associated HvC.cor
## 69 GO:0000184 51|3115 0.155935000039117
## 678 GO:0003723 192|2974 0.108004434601142
## 685 GO:0003735 90|3076 0.123324704998837
## 1555 GO:0006364 107|3059 0.15341015563038
## 1583 GO:0006412 91|3075 0.112808804862397
## 1584 GO:0006413 59|3107 0.179027242557559
## 1691 GO:0006614 40|3126 0.190260708182107
## 2739 GO:0015935 5|3161 0.0930941573118543
## 3068 GO:0019083 54|3112 0.181441053791468
## 3282 GO:0022625 23|3143 0.182912549810419
## 3284 GO:0022627 13|3153 0.0890843389977929
## 3491 GO:0030529 56|3110 0.0921453353430005
## RvC.cor HvR.cor HvC.fdr RvC.fdr
## 69 0.141932645061681 0.207453532035914 1.292855e-15 9.395638e-13
## 678 0.0778372563750377 0.0981378441141001 5.086252e-07 2.587043e-03
## 685 0.0791092847400972 0.127544715047509 2.110958e-09 2.094970e-03
## 1555 0.10020307060259 0.159632271908563 4.076119e-15 8.804281e-06
## 1583 0.0951133052613247 0.127119779871049 9.447601e-08 3.770882e-05
## 1584 0.145892327230887 0.257055550916626 6.720057e-21 1.632575e-13
## 1691 0.16505630794524 0.242415736986775 2.835354e-23 1.228997e-17
## 2739 0.103442966967946 0.0751383761742697 5.081010e-05 3.187952e-06
## 3068 0.159433097605577 0.228074676334547 2.131689e-21 2.112842e-16
## 3282 0.0906802102359322 0.192693368293101 1.319366e-21 1.258173e-04
## 3284 0.173702213633304 0.168552528350686 1.463760e-04 1.189189e-19
## 3491 0.0725262713266101 0.0771279286045538 6.287765e-05 7.290814e-03
## HvR.fdr
## 69 8.351700e-29
## 678 1.079789e-05
## 685 3.727421e-10
## 1555 1.211269e-16
## 1583 3.918865e-10
## 1584 4.868216e-45
## 1691 5.887188e-40
## 2739 3.072017e-03
## 3068 3.370142e-35
## 3282 7.580062e-25
## 3284 1.082676e-18
## 3491 1.948808e-03
## Term
## 69 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay
## 678 RNA binding
## 685 structural constituent of ribosome
## 1555 rRNA processing
## 1583 translation
## 1584 translational initiation
## 1691 SRP-dependent cotranslational protein targeting to membrane
## 2739 small ribosomal subunit
## 3068 viral transcription
## 3282 cytosolic large ribosomal subunit
## 3284 cytosolic small ribosomal subunit
## 3491 ribonucleoprotein complex
######
# tableS5
tableS5 <- GO.top.table[,c(1,13,2,3,5,7,9:11)]
cor.dir <- substring(as.matrix(tableS5),1,1)
cor.dir[cor.dir != "-"] <- "+"
tableS5[,4:6] <- cor.dir[,4:6]
tableS5[,7:9] <- format(as.numeric(as.matrix(tableS5[,7:9])),digit=3)
tableS5$Term <- sub(","," &",tableS5$Term)
#write.csv(tableS5,"../../tables/tableS5.csv", quote = F,row.names = F)
######
# genes that are shared across GO enrichment terms
top.GO.index <- list()
top.GO.terms <- GO.top.table[,1]
for (i in 1:length(top.GO.terms)){
top.GO.index[[i]]<-grep(top.GO.terms[i],protein.buffering.results.subcell$Gene.ontology.IDs)
}
all.unique.index <- unique(unlist(top.GO.index))
top.GO.result <- matrix(nrow = length(all.unique.index), ncol = length(top.GO.terms))
colnames(top.GO.result) <- top.GO.terms
rownames(top.GO.result) <- protein.buffering.results.subcell$ENSG[all.unique.index]
for (i in 1:length(top.GO.index)){
top.GO.result[,i] <- as.numeric(all.unique.index %in% unlist(top.GO.index[i]))
}
colSums(top.GO.result[which(apply(top.GO.result,1,sum) > 5),])
## GO:0000184 GO:0003723 GO:0003735 GO:0006364 GO:0006412 GO:0006413
## 31 20 31 31 30 31
## GO:0006614 GO:0015935 GO:0019083 GO:0022625 GO:0022627 GO:0030529
## 31 4 31 20 11 9
colnames(top.GO.result) <- GO.top.table$Term
colSums(top.GO.result[which(rowSums(top.GO.result) > 5),])
## nuclear-transcribed mRNA catabolic process, nonsense-mediated decay
## 31
## RNA binding
## 20
## structural constituent of ribosome
## 31
## rRNA processing
## 31
## translation
## 30
## translational initiation
## 31
## SRP-dependent cotranslational protein targeting to membrane
## 31
## small ribosomal subunit
## 4
## viral transcription
## 31
## cytosolic large ribosomal subunit
## 20
## cytosolic small ribosomal subunit
## 11
## ribonucleoprotein complex
## 9
share.matrix <- matrix(nrow = 12,ncol = 12)
for (i in 1:12){
share.matrix[,i]<- colSums(top.GO.result[which(top.GO.result[,i] == 1),])/sum(top.GO.result[which(top.GO.result[,i] == 1),i])
}
rownames(share.matrix) <- GO.top.table$Term
colnames(share.matrix) <- GO.top.table$Term
Protein Protein Interactions
# Protein protein interaction
# the 3.4.139 version is updated on 07/26/16
biogrid <- read.delim(gzfile("../rdas/BIOGRID-ORGANISM-Homo_sapiens-3.4.139.tab2.txt.gz"), stringsAsFactors=F)
biogrid <- biogrid[which(biogrid$Experimental.System.Type == "physical"),]
# from the contributing guide line, I learned that interactor A are the baits used in each experiment and interactor B are the hits. So it makes more sense to focus on nubmer of hits (interactor B) when reoprting number of protein protein interactions for each gene. Since nubmer of times for a gene being used as baits (interactor A) mainly reflects which genes people are more interested in studying. While counting interactor B is more appropriate, it still suffers from "publication bias".
# simply sum up the nubmer of times a gene is show up as a hit (interactor B)
interactor.B.counts <- function(x) {
c(counts.B=dim(x)[1])
}
I.cnts.B <- ddply(biogrid, c("Official.Symbol.Interactor.B"), interactor.B.counts)
names(I.cnts.B)[1] <- "gene.symbol"
protein.buffering.pval.PPI <- merge(protein.buffering.results.length,I.cnts.B)
HvC.cor.test <- cor.test(-log10(protein.buffering.pval.PPI$HvC.pval), protein.buffering.pval.PPI$counts.B)
RvC.cor.test <- cor.test(-log10(protein.buffering.pval.PPI$RvC.pval), protein.buffering.pval.PPI$counts.B)
RvH.cor.test <- cor.test(-log10(protein.buffering.pval.PPI$RvH.pval), protein.buffering.pval.PPI$counts.B)
PPI.results<- c("PPI",HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
PPI.results
## cor
## "PPI" "0.120149452814719" "1.4336910344719e-11"
## cor cor
## "0.0866078371089322" "1.1718834649188e-06" "0.0753096576741107"
##
## "2.39632359920125e-05"
relaxation in transcription regulation
read.csv("../rdas/battle_etal_RNA_logCPM_data.csv")->YRI.RNA.CPM
length(YRI.RNA.CPM$ENSG)
## [1] 16614
# TMM normalization
# calculate M value (log2 fold change relative to GM19238)
YRI.RNA.M <- YRI.RNA.CPM[,2:76]-YRI.RNA.CPM$GM19238
Tmean<-function(m,fractionTrimmed){
n<-length(m)
lo <- floor(n*fractionTrimmed)+1
hi <- n + 1 -lo
keep <- rank(m) > lo & rank(m) < hi
mean(m[keep], na.rm = TRUE)
}
RNA.trimmed.mean.M.values <- apply(YRI.RNA.M,2,Tmean,0.3)
YRI.RNA.CPM.TMM <- t(t(YRI.RNA.CPM[,2:76]) - RNA.trimmed.mean.M.values)
# qunatile normalization
library(limma)
YRI.RNA.CPM.QN <- normalizeQuantiles(YRI.RNA.CPM[,2:76])
CPM.mean <- apply(YRI.RNA.CPM[,2:76],1,mean)
CPM.sd <- apply(YRI.RNA.CPM[,2:76],1,sd)
CPM.TMM.mean <- apply(YRI.RNA.CPM.TMM,1,mean)
CPM.TMM.sd <- apply(YRI.RNA.CPM.TMM,1,sd)
CPM.QN.mean <- apply(YRI.RNA.CPM.QN,1,mean)
CPM.QN.sd <- apply(YRI.RNA.CPM.QN,1,sd)
YRI.RNA.CPM.mean.sd <- as.data.frame(YRI.RNA.CPM$ENSG)
YRI.RNA.CPM.mean.sd$RNA.mean <- CPM.mean
YRI.RNA.CPM.mean.sd$RNA.sd <- CPM.sd
YRI.RNA.CPM.mean.sd$RNA.TMM.mean <- CPM.TMM.mean
YRI.RNA.CPM.mean.sd$RNA.TMM.sd <- CPM.TMM.sd
YRI.RNA.CPM.mean.sd$RNA.QN.mean <- CPM.QN.mean
YRI.RNA.CPM.mean.sd$RNA.QN.sd <- CPM.QN.sd
names(YRI.RNA.CPM.mean.sd)[1] <- "ENSG"
protein.buffering.results.YRI <- merge(protein.buffering.results, YRI.RNA.CPM.mean.sd)
F3S5 <- ggplot(mapping = aes(x=protein.buffering.results.YRI$RNA.mean,y=protein.buffering.results.YRI$RNA.sd))+geom_point(size=0.75, alpha=0.3, col="black")+ xlab("log2CPM")+ylab("standard deviation")+ggtitle("")+theme_bw()+stat_smooth(method = "loess")+ggtitle("YRI: RNA-seq mean-variance relationship")+theme(plot.title = element_text(size = 12))
F3S5
######
#Fig3_S5
# pdf("../../figures/Fig3S5.pdf", width = 4, height = 3)
# F3S5
# dev.off()
######
# load YRI protein expression data
YRI.protein <- read.csv("../rdas/battle_etal_protein_log2_SILAC_ratio_data.csv")
boxplot(YRI.protein[,c(2:63)], outline = FALSE)
YRI.protein.mean <- apply(YRI.protein[,c(2:63)],1,mean,na.rm = TRUE)
YRI.protein.sd <- apply(YRI.protein[,c(2:63)],1,sd,na.rm = TRUE)
Protein.trimmed.mean.M.values <- apply(YRI.protein[,c(2:63)],2,Tmean,0.3)
YRI.protein.TMM <- t(t(YRI.protein[,c(2:63)]) - Protein.trimmed.mean.M.values)
YRI.protein.TMM.mean <- apply(YRI.protein.TMM,1,mean,na.rm = TRUE)
YRI.protein.TMM.sd <- apply(YRI.protein.TMM,1,sd,na.rm = TRUE)
YRI.protein.QN <- normalizeQuantiles(YRI.protein[,c(2:63)])
YRI.protein.QN.mean <- apply(YRI.protein.QN,1,mean,na.rm = TRUE)
YRI.protein.QN.sd <- apply(YRI.protein.QN,1,sd,na.rm = TRUE)
protein.mean.sd <- as.data.frame(YRI.protein$ENSG)
protein.mean.sd$protein.mean <- YRI.protein.mean
protein.mean.sd$protein.sd <- YRI.protein.sd
protein.mean.sd$protein.TMM.mean <- YRI.protein.TMM.mean
protein.mean.sd$protein.TMM.sd <- YRI.protein.TMM.sd
protein.mean.sd$protein.QN.mean <- YRI.protein.QN.mean
protein.mean.sd$protein.QN.sd <- YRI.protein.QN.sd
names(protein.mean.sd)[1] <- "ENSG"
protein.buffering.results.YRI <- merge(protein.buffering.results.YRI, protein.mean.sd)
YRI.expression.features <- protein.buffering.results.YRI[,-c(1:5)]
# write a loop to test association with each feature
Enrichment.results <- as.data.frame(matrix(nrow = length(YRI.expression.features[1,]),ncol = 7))
names(Enrichment.results) <- c("feature","HvC.cor","HvC.p.value","RvC.cor","RvC.p.value","HvR.cor","HvR.p.value")
# write a for loop to grep the subcellular location terms and test for association with significance of protein buffering
for (i in 1:length(YRI.expression.features[1,])){
HvC.cor.test <- cor.test(-log10(protein.buffering.results.YRI$HvC.pval), YRI.expression.features[,i])
RvC.cor.test <- cor.test(-log10(protein.buffering.results.YRI$RvC.pval), YRI.expression.features[,i])
RvH.cor.test <- cor.test(-log10(protein.buffering.results.YRI$RvH.pval), YRI.expression.features[,i])
Enrichment.results[i,] <- c(names(YRI.expression.features)[i],HvC.cor.test$estimate,HvC.cor.test$p.value,RvC.cor.test$estimate,RvC.cor.test$p.value,RvH.cor.test$estimate,RvH.cor.test$p.value)
}
Enrichment.results
## feature HvC.cor HvC.p.value
## 1 RNA.mean 0.055439295351886 0.0020160390406914
## 2 RNA.sd 0.0905090216562931 4.47255963613277e-07
## 3 RNA.TMM.mean 0.055439295351886 0.00201603904069141
## 4 RNA.TMM.sd 0.0871762751475496 1.16753826023672e-06
## 5 RNA.QN.mean 0.0559701107066437 0.00182409973745792
## 6 RNA.QN.sd 0.101050083156408 1.71638902608821e-08
## 7 protein.mean -0.0568372285411644 0.00154632657265966
## 8 protein.sd -0.0101725403209787 0.571278697451835
## 9 protein.TMM.mean -0.0570968612537291 0.00147106844312246
## 10 protein.TMM.sd -0.00289737335191094 0.871893194707263
## 11 protein.QN.mean -0.0540943516932361 0.00258817601924473
## 12 protein.QN.sd -0.0241450223217146 0.178951072419093
## RvC.cor RvC.p.value HvR.cor
## 1 0.0100561328204343 0.575690821689558 -0.00622333882269453
## 2 0.0724237019420466 5.4357353566756e-05 0.0941594924381421
## 3 0.0100561328204343 0.575690821689558 -0.00622333882269453
## 4 0.0696113432590646 0.000104913350876627 0.0868166963592504
## 5 0.0102171961626635 0.569590412194896 -0.00601680989118886
## 6 0.0785894882372496 1.18304443728429e-05 0.100013448451604
## 7 -0.0271913880525138 0.130121817761927 -0.0347495166764506
## 8 -0.00419813392274674 0.815257510641394 0.0152640462614856
## 9 -0.0272497309412729 0.129299363771228 -0.0347082922714081
## 10 0.000328264875309194 0.985423743919515 0.0174835502722664
## 11 -0.0260095370630735 0.147670024558935 -0.0331050867125738
## 12 -0.0110700369465528 0.537812766855007 0.0119923841058773
## HvR.p.value
## 1 0.729068837621651
## 2 1.50345165823514e-07
## 3 0.729068837621651
## 4 1.2922527106569e-06
## 5 0.737723363590719
## 6 2.4015507552288e-08
## 7 0.0530424523002832
## 8 0.395562602411551
## 9 0.0533251138717885
## 10 0.330491346876476
## 11 0.0653326640556376
## 12 0.504476818236239
###########
# write YRI transcipt level variation enrichment resutls to TableS6
tableS6 <- Enrichment.results[1:6,]
cor.dir <- substring(as.matrix(tableS6),1,1)
cor.dir[cor.dir != "-"] <- "+"
tableS6[,c(2,4,6)] <- cor.dir[,c(2,4,6)]
tableS6[,c(3,5,7)] <- format(as.numeric(as.matrix(tableS6[,c(3,5,7)])),digit=3)
#write.csv(tableS6,"../../tables/tableS6.csv", quote = F,row.names = F)
######
Make plots set cut off at FWER 0.05
#HvC
# RNA
buffered.RNA.sd <- protein.buffering.results.YRI$RNA.sd[p.adjust(protein.buffering.results.YRI$HvC.pval,method = "bonferroni") < 0.05]
background.RNA.sd <- protein.buffering.results.YRI$RNA.sd[!p.adjust(protein.buffering.results.YRI$HvC.pval,method = "bonferroni") < 0.05]
HvC.mean.sd.buffered <- mean(buffered.RNA.sd, na.rm = T)
HvC.sd.sd.buffered <- sd(buffered.RNA.sd, na.rm = T)
HvC.se.sd.buffered <- HvC.sd.sd.buffered/sqrt(length(buffered.RNA.sd))
HvC.mean.sd.background <- mean(background.RNA.sd, na.rm = T)
HvC.sd.sd.background <- sd(background.RNA.sd, na.rm = T)
HvC.se.sd.background <- HvC.sd.sd.background/sqrt(length(background.RNA.sd))
wilcox.test(buffered.RNA.sd,background.RNA.sd,alternative = "greater")
##
## Wilcoxon rank sum test with continuity correction
##
## data: buffered.RNA.sd and background.RNA.sd
## W = 68272, p-value = 0.0009306
## alternative hypothesis: true location shift is greater than 0
plot(c(1:4),c("",HvC.mean.sd.background,HvC.mean.sd.buffered,""), ylim = c(0.2,0.6),pch = 19,ylab = "standard deviation", main = "Cross primate buffered genes have higher variation in YRI transcript levels",xaxt="n", cex = 0.5, col = rgb(1,0.5,0), xlab = "")
axis(1,at = c(1:4),labels = c("","background","buffered",""),tick = FALSE)
arrows(c(1:4), c(0,HvC.mean.sd.background-HvC.se.sd.background,HvC.mean.sd.buffered-HvC.se.sd.background,0), c(1:4),c(0,HvC.mean.sd.background+HvC.se.sd.background,HvC.mean.sd.buffered+HvC.se.sd.background,0), length=0, angle=90, code=3, lwd = 5, col = rgb(1,0.5,0,0.5))
# protein
buffered.protien.sd <- protein.buffering.results.YRI$protein.sd[p.adjust(protein.buffering.results.YRI$HvC.pval,method = "bonferroni") < 0.05]
background.protein.sd <- protein.buffering.results.YRI$protein.sd[!p.adjust(protein.buffering.results.YRI$HvC.pval,method = "bonferroni") < 0.05]
HvC.mean.sd.buffered <- mean(buffered.protien.sd, na.rm = T)
HvC.sd.sd.buffered <- sd(buffered.protien.sd, na.rm = T)
HvC.se.sd.buffered <- HvC.sd.sd.buffered/sqrt(length(buffered.protien.sd))
HvC.mean.sd.background <- mean(background.protein.sd, na.rm = T)
HvC.sd.sd.background <- sd(background.protein.sd, na.rm = T)
HvC.se.sd.background <- HvC.sd.sd.background/sqrt(length(background.protein.sd))
wilcox.test(buffered.protien.sd,background.protein.sd,alternative = "less")
##
## Wilcoxon rank sum test with continuity correction
##
## data: buffered.protien.sd and background.protein.sd
## W = 42270, p-value = 0.02884
## alternative hypothesis: true location shift is less than 0
points(c(1:4),c("",HvC.mean.sd.background,HvC.mean.sd.buffered,""),pch = 19, cex = 0.5, col = rgb(0,0,0))
arrows(c(1:4), c(0,HvC.mean.sd.background-HvC.se.sd.background,HvC.mean.sd.buffered-HvC.se.sd.background,0), c(1:4),c(0,HvC.mean.sd.background+HvC.se.sd.background,HvC.mean.sd.buffered+HvC.se.sd.background,0), length=0, angle=90, code=3, lwd = 5, col = rgb(0,0,0,0.5))
legend("topright", inset=c(-0.2,0),legend = c("transcript","protein"),col = c("orange","black"),pch = 19,cex = 0.5,bty = "n",xpd = TRUE)
# supplement
#RvC
# RNA
buffered.RNA.sd <- protein.buffering.results.YRI$RNA.sd[p.adjust(protein.buffering.results.YRI$RvC.pval,method = "bonferroni") < 0.05]
background.RNA.sd <- protein.buffering.results.YRI$RNA.sd[!p.adjust(protein.buffering.results.YRI$RvC.pval,method = "bonferroni") < 0.05]
RvC.mean.sd.buffered <- mean(buffered.RNA.sd, na.rm = T)
RvC.sd.sd.buffered <- sd(buffered.RNA.sd, na.rm = T)
RvC.se.sd.buffered <- RvC.sd.sd.buffered/sqrt(length(buffered.RNA.sd))
RvC.mean.sd.background <- mean(background.RNA.sd, na.rm = T)
RvC.sd.sd.background <- sd(background.RNA.sd, na.rm = T)
RvC.se.sd.background <- RvC.sd.sd.background/sqrt(length(background.RNA.sd))
wilcox.test(buffered.RNA.sd,background.RNA.sd,alternative = "greater")
##
## Wilcoxon rank sum test with continuity correction
##
## data: buffered.RNA.sd and background.RNA.sd
## W = 77986, p-value = 0.03406
## alternative hypothesis: true location shift is greater than 0
plot(c(1:4),c("",RvC.mean.sd.background,RvC.mean.sd.buffered,""), ylim = c(0.2,0.6),pch = 19,ylab = "standard deviation", main = "Cross primate buffered genes have higher variation in YRI transcript levels",xaxt="n", cex = 0.5, col = rgb(1,0.5,0), xlab = "")
axis(1,at = c(1:4),labels = c("","background","buffered",""),tick = FALSE)
arrows(c(1:4), c(0,RvC.mean.sd.background-RvC.se.sd.background,RvC.mean.sd.buffered-RvC.se.sd.background,0), c(1:4),c(0,RvC.mean.sd.background+RvC.se.sd.background,RvC.mean.sd.buffered+RvC.se.sd.background,0), length=0, angle=90, code=3, lwd = 5, col = rgb(1,0.5,0,0.5))
# protein
buffered.protien.sd <- protein.buffering.results.YRI$protein.sd[p.adjust(protein.buffering.results.YRI$RvC.pval,method = "bonferroni") < 0.05]
background.protein.sd <- protein.buffering.results.YRI$protein.sd[!p.adjust(protein.buffering.results.YRI$RvC.pval,method = "bonferroni") < 0.05]
RvC.mean.sd.buffered <- mean(buffered.protien.sd, na.rm = T)
RvC.sd.sd.buffered <- sd(buffered.protien.sd, na.rm = T)
RvC.se.sd.buffered <- RvC.sd.sd.buffered/sqrt(length(buffered.protien.sd))
RvC.mean.sd.background <- mean(background.protein.sd, na.rm = T)
RvC.sd.sd.background <- sd(background.protein.sd, na.rm = T)
RvC.se.sd.background <- RvC.sd.sd.background/sqrt(length(background.protein.sd))
wilcox.test(buffered.protien.sd,background.protein.sd,alternative = "less")
##
## Wilcoxon rank sum test with continuity correction
##
## data: buffered.protien.sd and background.protein.sd
## W = 46730, p-value = 0.0002526
## alternative hypothesis: true location shift is less than 0
points(c(1:4),c("",RvC.mean.sd.background,RvC.mean.sd.buffered,""),pch = 19, cex = 0.5, col = rgb(0,0,0))
arrows(c(1:4), c(0,RvC.mean.sd.background-RvC.se.sd.background,RvC.mean.sd.buffered-RvC.se.sd.background,0), c(1:4),c(0,RvC.mean.sd.background+RvC.se.sd.background,RvC.mean.sd.buffered+RvC.se.sd.background,0), length=0, angle=90, code=3, lwd = 5, col = rgb(0,0,0,0.5))
legend("topright", inset=c(-0.2,0),legend = c("transcript","protein"),col = c("orange","black"),pch = 19,cex = 0.5,bty = "n",xpd = TRUE)
#RvH
# RNA
buffered.RNA.sd <- protein.buffering.results.YRI$RNA.sd[p.adjust(protein.buffering.results.YRI$RvH.pval,method = "bonferroni") < 0.05]
background.RNA.sd <- protein.buffering.results.YRI$RNA.sd[!p.adjust(protein.buffering.results.YRI$RvH.pval,method = "bonferroni") < 0.05]
RvH.mean.sd.buffered <- mean(buffered.RNA.sd, na.rm = T)
RvH.sd.sd.buffered <- sd(buffered.RNA.sd, na.rm = T)
RvH.se.sd.buffered <- RvH.sd.sd.buffered/sqrt(length(buffered.RNA.sd))
RvH.mean.sd.background <- mean(background.RNA.sd, na.rm = T)
RvH.sd.sd.background <- sd(background.RNA.sd, na.rm = T)
RvH.se.sd.background <- RvH.sd.sd.background/sqrt(length(background.RNA.sd))
wilcox.test(buffered.RNA.sd,background.RNA.sd,alternative = "greater")
##
## Wilcoxon rank sum test with continuity correction
##
## data: buffered.RNA.sd and background.RNA.sd
## W = 100850, p-value = 0.0009304
## alternative hypothesis: true location shift is greater than 0
plot(c(1:4),c("",RvH.mean.sd.background,RvH.mean.sd.buffered,""), ylim = c(0.2,0.6),pch = 19,ylab = "standard deviation", main = "Cross primate buffered genes have higher variation in YRI transcript levels",xaxt="n", cex = 0.5, col = rgb(1,0.5,0), xlab = "")
axis(1,at = c(1:4),labels = c("","background","buffered",""),tick = FALSE)
arrows(c(1:4), c(0,RvH.mean.sd.background-RvH.se.sd.background,RvH.mean.sd.buffered-RvH.se.sd.background,0), c(1:4),c(0,RvH.mean.sd.background+RvH.se.sd.background,RvH.mean.sd.buffered+RvH.se.sd.background,0), length=0, angle=90, code=3, lwd = 5, col = rgb(1,0.5,0,0.5))
# protein
buffered.protien.sd <- protein.buffering.results.YRI$protein.sd[p.adjust(protein.buffering.results.YRI$RvH.pval,method = "bonferroni") < 0.05]
background.protein.sd <- protein.buffering.results.YRI$protein.sd[!p.adjust(protein.buffering.results.YRI$RvH.pval,method = "bonferroni") < 0.05]
RvH.mean.sd.buffered <- mean(buffered.protien.sd, na.rm = T)
RvH.sd.sd.buffered <- sd(buffered.protien.sd, na.rm = T)
RvH.se.sd.buffered <- RvH.sd.sd.buffered/sqrt(length(buffered.protien.sd))
RvH.mean.sd.background <- mean(background.protein.sd, na.rm = T)
RvH.sd.sd.background <- sd(background.protein.sd, na.rm = T)
RvH.se.sd.background <- RvH.sd.sd.background/sqrt(length(background.protein.sd))
wilcox.test(buffered.protien.sd,background.protein.sd,alternative = "less")
##
## Wilcoxon rank sum test with continuity correction
##
## data: buffered.protien.sd and background.protein.sd
## W = 61917, p-value = 0.001781
## alternative hypothesis: true location shift is less than 0
points(c(1:4),c("",RvH.mean.sd.background,RvH.mean.sd.buffered,""),pch = 19, cex = 0.5, col = rgb(0,0,0))
arrows(c(1:4), c(0,RvH.mean.sd.background-RvH.se.sd.background,RvH.mean.sd.buffered-RvH.se.sd.background,0), c(1:4),c(0,RvH.mean.sd.background+RvH.se.sd.background,RvH.mean.sd.buffered+RvH.se.sd.background,0), length=0, angle=90, code=3, lwd = 5, col = rgb(0,0,0,0.5))
legend("topright", inset=c(-0.2,0),legend = c("transcript","protein"),col = c("orange","black"),pch = 19,cex = 0.5,bty = "n",xpd = TRUE)
gene expression variation in YRI across multiple Human-chimp buffering significance cutoffs
# HvC
#SD
sd.by.pvalue <- list()
for (i in 1:4){
sd.by.pvalue[[i]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$HvC.pval) <i*2) ]
}
sd.by.pvalue[[5]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 8)]
HvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
HvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
HvC.se.sd.by.pval <- HvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
plot(c(1:5),HvC.mean.sd.by.pval, ylim = c(0,1),pch = 19,ylab = "standard deviation", xlab = ("-log10(p-value)"), main = "PT buffered genes",xaxt="n", cex = 0.5, col = rgb(0,0,0,0.5))
axis(1,at = (1:5),labels = c("0~2","2~4","4~6","6~8","8~"))
arrows(c(1:5), HvC.mean.sd.by.pval-HvC.se.sd.by.pval, c(1:5), HvC.mean.sd.by.pval+HvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#rna
sd.by.pvalue <- list()
for (i in 1:4){
sd.by.pvalue[[i]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$HvC.pval) <i*2) ]
}
sd.by.pvalue[[5]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 8)]
HvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
HvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
HvC.se.sd.by.pval <- HvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
points(c(1:5),HvC.mean.sd.by.pval,pch = 19, cex = 0.5, col = rgb(1,0.5,0,0.5))
arrows(c(1:5), HvC.mean.sd.by.pval-HvC.se.sd.by.pval, c(1:5), HvC.mean.sd.by.pval+HvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
legend("topleft", legend = c("Transcript","Protein"),col = c("orange","black"),pch = 19,cex = 0.5,bty = "n",xpd = TRUE)
######
# fig 3f
# pdf("../../figures/Fig3f.pdf", width = 4, height = 4)
#
# # HvC
# #SD
# sd.by.pvalue <- list()
# for (i in 1:4){
# sd.by.pvalue[[i]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$HvC.pval) <i*2) ]
# }
# sd.by.pvalue[[5]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 8)]
#
# HvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
# HvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
# HvC.se.sd.by.pval <- HvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
#
# plot(c(1:5),HvC.mean.sd.by.pval, ylim = c(0,1),pch = 19,ylab = "standard deviation", xlab = ("-log10(p-value)"), main = "Variation in YRI expression for \n post-translationally buffered genes",xaxt="n", cex = 0.5, cex.main= 0.75, col = rgb(0,0,0,0.5))
# axis(1,at = (1:5),labels = c("0~2","2~4","4~6","6~8","8~"))
#
#
# arrows(c(1:5), HvC.mean.sd.by.pval-HvC.se.sd.by.pval, c(1:5), HvC.mean.sd.by.pval+HvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#
# #rna
#
# sd.by.pvalue <- list()
# for (i in 1:4){
# sd.by.pvalue[[i]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$HvC.pval) <i*2) ]
# }
# sd.by.pvalue[[5]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$HvC.pval) > 8)]
#
# HvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
# HvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
# HvC.se.sd.by.pval <- HvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
#
#
#
# points(c(1:5),HvC.mean.sd.by.pval,pch = 19, cex = 0.5, col = rgb(1,0.5,0,0.5))
#
# arrows(c(1:5), HvC.mean.sd.by.pval-HvC.se.sd.by.pval, c(1:5), HvC.mean.sd.by.pval+HvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
#
#
# legend("topleft",legend = c("Transcript","Protein"),col = c("orange","black"),pch = 19,cex = 0.75,bty = "n",xpd = TRUE)
#
#
#
# dev.off()
######
## supplement
# RvC
#SD
sd.by.pvalue <- list()
for (i in 1:4){
sd.by.pvalue[[i]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvC.pval) <i*2) ]
}
sd.by.pvalue[[5]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 8)]
RvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
RvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
RvC.se.sd.by.pval <- RvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
plot(c(1:5),RvC.mean.sd.by.pval, ylim = c(0,1),pch = 19,ylab = "standard deviation", xlab = ("-log10(p-value)"), main = "PT buffered genes",xaxt="n", cex = 0.5, col = rgb(0,0,0,0.5))
axis(1,at = (1:5),labels = c("0~2","2~4","4~6","6~8","8~"))
arrows(c(1:5), RvC.mean.sd.by.pval-RvC.se.sd.by.pval, c(1:5), RvC.mean.sd.by.pval+RvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#rna
sd.by.pvalue <- list()
for (i in 1:4){
sd.by.pvalue[[i]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvC.pval) <i*2) ]
}
sd.by.pvalue[[5]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 8)]
RvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
RvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
RvC.se.sd.by.pval <- RvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
points(c(1:5),RvC.mean.sd.by.pval,pch = 19, cex = 0.5, col = rgb(1,0.5,0,0.5))
arrows(c(1:5), RvC.mean.sd.by.pval-RvC.se.sd.by.pval, c(1:5), RvC.mean.sd.by.pval+RvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
######
# fig 3 S4a
# pdf("../../figures/Fig3S4a.pdf", width = 4, height = 4)
#
# # RvC
# #SD
# sd.by.pvalue <- list()
# for (i in 1:4){
# sd.by.pvalue[[i]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvC.pval) <i*2) ]
# }
# sd.by.pvalue[[5]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 8)]
#
# RvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
# RvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
# RvC.se.sd.by.pval <- RvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
#
# plot(c(1:5),RvC.mean.sd.by.pval, ylim = c(0,1),pch = 19,ylab = "standard deviation", xlab = ("-log10(p-value)"), main = "RvC: Variation in YRI expression for \n post-translationally buffered genes",xaxt="n", cex = 0.5, cex.main= 0.75, col = rgb(0,0,0,0.5))
# axis(1,at = (1:5),labels = c("0~2","2~4","4~6","6~8","8~"))
#
#
# arrows(c(1:5), RvC.mean.sd.by.pval-RvC.se.sd.by.pval, c(1:5), RvC.mean.sd.by.pval+RvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#
# #rna
#
# sd.by.pvalue <- list()
# for (i in 1:4){
# sd.by.pvalue[[i]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvC.pval) <i*2) ]
# }
# sd.by.pvalue[[5]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvC.pval) > 8)]
#
# RvC.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
# RvC.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
# RvC.se.sd.by.pval <- RvC.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
#
#
#
# points(c(1:5),RvC.mean.sd.by.pval,pch = 19, cex = 0.5, col = rgb(1,0.5,0,0.5))
#
# arrows(c(1:5), RvC.mean.sd.by.pval-RvC.se.sd.by.pval, c(1:5), RvC.mean.sd.by.pval+RvC.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
#
#
# legend("topleft",legend = c("Transcript","Protein"),col = c("orange","black"),pch = 19,cex = 0.75,bty = "n",xpd = TRUE)
#
#
#
# dev.off()
######
# RvH
#SD
sd.by.pvalue <- list()
for (i in 1:4){
sd.by.pvalue[[i]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvH.pval) <i*2) ]
}
sd.by.pvalue[[5]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 8)]
RvH.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
RvH.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
RvH.se.sd.by.pval <- RvH.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
plot(c(1:5),RvH.mean.sd.by.pval, ylim = c(0,1),pch = 19,ylab = "standard deviation", xlab = ("-log10(p-value)"), main = "PT buffered genes",xaxt="n", cex = 0.5, col = rgb(0,0,0,0.5))
axis(1,at = (1:5),labels = c("0~2","2~4","4~6","6~8","8~"))
arrows(c(1:5), RvH.mean.sd.by.pval-RvH.se.sd.by.pval, c(1:5), RvH.mean.sd.by.pval+RvH.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#rna
sd.by.pvalue <- list()
for (i in 1:4){
sd.by.pvalue[[i]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvH.pval) <i*2) ]
}
sd.by.pvalue[[5]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 8)]
RvH.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
RvH.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
RvH.se.sd.by.pval <- RvH.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
points(c(1:5),RvH.mean.sd.by.pval,pch = 19, cex = 0.5, col = rgb(1,0.5,0,0.5))
arrows(c(1:5), RvH.mean.sd.by.pval-RvH.se.sd.by.pval, c(1:5), RvH.mean.sd.by.pval+RvH.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
######
# fig 3 S4b
# pdf("../../figures/Fig3S4b.pdf", width = 4, height = 4)
#
# # RvH
# #SD
# sd.by.pvalue <- list()
# for (i in 1:4){
# sd.by.pvalue[[i]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvH.pval) <i*2) ]
# }
# sd.by.pvalue[[5]] <- protein.buffering.results.YRI$protein.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 8)]
#
# RvH.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
# RvH.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
# RvH.se.sd.by.pval <- RvH.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
#
# plot(c(1:5),RvH.mean.sd.by.pval, ylim = c(0,1),pch = 19,ylab = "standard deviation", xlab = ("-log10(p-value)"), main = "RvH: Variation in YRI expression for \n post-translationally buffered genes",xaxt="n", cex = 0.5, cex.main= 0.75, col = rgb(0,0,0,0.5))
# axis(1,at = (1:5),labels = c("0~2","2~4","4~6","6~8","8~"))
#
#
# arrows(c(1:5), RvH.mean.sd.by.pval-RvH.se.sd.by.pval, c(1:5), RvH.mean.sd.by.pval+RvH.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(0,0,0,0.5))
#
# #rna
#
# sd.by.pvalue <- list()
# for (i in 1:4){
# sd.by.pvalue[[i]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 2*(i-1) & -log10(protein.buffering.results.YRI$RvH.pval) <i*2) ]
# }
# sd.by.pvalue[[5]] <- protein.buffering.results.YRI$RNA.sd[which(-log10(protein.buffering.results.YRI$RvH.pval) > 8)]
#
# RvH.mean.sd.by.pval <- unlist(lapply(sd.by.pvalue,mean))
# RvH.sd.sd.by.pval <- unlist(lapply(sd.by.pvalue,sd))
# RvH.se.sd.by.pval <- RvH.sd.sd.by.pval/sqrt(lengths(sd.by.pvalue))
#
#
#
# points(c(1:5),RvH.mean.sd.by.pval,pch = 19, cex = 0.5, col = rgb(1,0.5,0,0.5))
#
# arrows(c(1:5), RvH.mean.sd.by.pval-RvH.se.sd.by.pval, c(1:5), RvH.mean.sd.by.pval+RvH.se.sd.by.pval, length=0, angle=90, code=3, lwd = 4, col = rgb(1,0.5,0,0.5))
#
#
# legend("topleft",legend = c("Transcript","Protein"),col = c("orange","black"),pch = 19,cex = 0.75,bty = "n",xpd = TRUE)
#
#
#
# dev.off()
######
Subsample to match expression level with buffered genes
# for HvC
buff.hist <- hist(protein.buffering.results.YRI$RNA.mean[protein.buffering.results.YRI$HvC.pval < 0.01], breaks = 10, plot = FALSE)
buff.hist$breaks -> exp.intervals
buff.hist$density -> exp.intervals.prob
protein.buffering.results.YRI$prob <- 0
for (i in 1:length(exp.intervals.prob)) {
which(exp.intervals[i] < protein.buffering.results.YRI$RNA.mean & protein.buffering.results.YRI$RNA.mean < exp.intervals[i+1])-> interval.index
protein.buffering.results.YRI$prob[interval.index] <- exp.intervals.prob[i]/length(interval.index)
}
#####
#create index base on row number, used row name BAD idea, carry over from the original data frame
c(1:length(protein.buffering.results.YRI[,1]))->protein.buffering.results.YRI$index
sample(protein.buffering.results.YRI$index,size=400,replace=F, prob=protein.buffering.results.YRI$prob)->sampling.index
#plot results
# mean
boxplot(protein.buffering.results.YRI$RNA.mean[-log10(protein.buffering.results.YRI$HvC.pval) > 2], protein.buffering.results.YRI$RNA.mean[sampling.index] , outline=F,main="YRI RNA mean expression for buffered genes", ylab="log2 expression")
axis(1, 1:2, labels=c("buffered","non-buffered"))
boxplot(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2], protein.buffering.results.YRI$RNA.sd[sampling.index], outline=F, notch=T ,main="YRI RNA expression sd for buffered genes", ylab="sd of log2 expression")
axis(1, 1:2, labels=c("buffered","non-buffered"))
#
plot(density(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]), main="YRI expression sd for PT buffered genes", xlab="sd of log2 mean expression", ylim=c(0,4))
sampling.index<-c()
sampling.sd.median<-c()
for (i in 1:1000){
sample(protein.buffering.results.YRI$index,size=400,replace=F, prob=protein.buffering.results.YRI$prob)->sampling.index
lines(density(protein.buffering.results.YRI$RNA.sd[sampling.index]), col="blue")
sampling.sd.median[i]<-median(protein.buffering.results.YRI$RNA.sd[sampling.index])
}
lines(density(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]), col="red", lwd=3)
legend("topright", c("buffer","nonbuffer"), lty=1 ,col=c("red","blue"))
boxplot(sampling.sd.median,ylim=c(0.35,0.45))
points(1,median(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]), col="red", pch=16)
# assume normal to get p value
pnorm(median(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]),mean=mean(sampling.sd.median),sd=sd(sampling.sd.median), lower.tail=F)
## [1] 1.083876e-07
# simply use the rank to estimate emperical p value
length(which(median(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]) < sampling.sd.median))+1/length(sampling.sd.median)
## [1] 0.001
######
# pdf("../../figures/Fig3S6a.pdf", width = 4, height = 4)
#
# plot(density(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]), main="YRI transcript level variation estimated from subsampling", cex.main=0.75, xlab="standard deviation", ylim=c(0,4), xaxt='n')
# axis(1,at = c(0,0.36,0.44,1,1.5),labels = c("0","","","1","1.5"))
# sampling.index<-c()
# sampling.sd.median<-c()
# for (i in 1:1000){
# sample(protein.buffering.results.YRI$index,size=400,replace=F, prob=protein.buffering.results.YRI$prob)->sampling.index
# lines(density(protein.buffering.results.YRI$RNA.sd[sampling.index]), col="dark blue")
# sampling.sd.median[i]<-median(protein.buffering.results.YRI$RNA.sd[sampling.index])
# }
# lines(density(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]), col="red", lwd=3)
# legend("topright", c("buffered","background"), lty=1, lwd = 2, bty = "n" ,col=c("red","dark blue"))
# dev.off()
######
######
# pdf("../../figures/Fig3S6b.pdf", width = 4, height = 4)
#
# boxplot(sampling.sd.median,ylim=c(0.35,0.45),yaxt="n",ylab="", cex.main=0.75, frame.plot = FALSE)
# axis(4)
# mtext('',4,line=2)
# points(1,median(protein.buffering.results.YRI$RNA.sd[-log10(protein.buffering.results.YRI$HvC.pval) > 2]), col="red", pch=16)
#
# dev.off()
######